Read data from the tab delimited file of gene expression genereated in Galaxy

counts = read.table("Genes_per_sample_featurecounts.tabular", header=TRUE, row.names=1)
head(counts)
##             SRR1554534 SRR1554535 SRR1554539 SRR1554537 SRR1554538 SRR1554541
## NM_000014.5          0          0          0          0          0          0
## NM_000015.3          6          4          4          2          6          4
## NM_000016.5          0          0          0          0          0          0
## NM_000017.4         44         29         27         16         32         16
## NM_000018.4          0          0          0          0          0          0
## NM_000019.4       1465       2990       2323       4505       5209       8137
counts_filtered = counts[rowSums(counts) > 0,]
colSums(counts_filtered)
## SRR1554534 SRR1554535 SRR1554539 SRR1554537 SRR1554538 SRR1554541 
##   12808691   16532499   16128222   26936762   31970026   33466916

##Make plots of the raw data by sample

library(graphics)
plot(counts[,1], ylab=colnames(counts)[1])

plot(counts[,2], ylab=colnames(counts)[2])

plot(counts[,3], ylab=colnames(counts)[3])

plot(counts[,4], ylab=colnames(counts)[4])

plot(counts[,5], ylab=colnames(counts)[5])

plot(counts[,6], ylab=colnames(counts)[6])

Make plots of the data filtered to exclude zero count genes

plot(counts_filtered[,1], ylab=colnames(counts_filtered)[1])

plot(counts_filtered[,2], ylab=colnames(counts_filtered)[2])

plot(counts_filtered[,3], ylab=colnames(counts_filtered)[3])

plot(counts_filtered[,4], ylab=colnames(counts_filtered)[4])

plot(counts_filtered[,5], ylab=colnames(counts_filtered)[5])

plot(counts_filtered[,6], ylab=colnames(counts_filtered)[6])

##Make plots of the log2 transformed data

plot(log2(counts_filtered[,1]+1), ylab=colnames(counts_filtered)[1])

plot(log2(counts_filtered[,2]+1), ylab=colnames(counts_filtered)[2])

plot(log2(counts_filtered[,3]+1), ylab=colnames(counts_filtered)[3])

plot(log2(counts_filtered[,4]+1), ylab=colnames(counts_filtered)[4])

plot(log2(counts_filtered[,5]+1), ylab=colnames(counts_filtered)[5])

plot(log2(counts_filtered[,6]+1), ylab=colnames(counts_filtered)[6])

##Make boxplots of the counts filtered and log2 transformed

par(mar=c(7,5,1,1))
boxplot(counts_filtered, las=2, main="filtered data")

boxplot(log2(counts_filtered+1), las=2, main="log2 transformed data")

##Read phenotype data downloaded from NCBI and add a column for age group

phenotype = read.table("phenotype_data_capstone_genomics_coursera.txt", header=TRUE, dec=".")
phenotype["age_group"]=as.factor(c("adult","adult", "adult", "fetal", "fetal", "fetal"))

##Annotate gene names

library(annotate)
## Loading required package: AnnotationDbi
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: IRanges
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
## 
##     expand.grid
## Loading required package: XML
library(org.Hs.eg.db)
## 
org.Hs.eg()
## Quality control information for org.Hs.eg:
## 
## 
## This package has the following mappings:
## 
## org.Hs.egACCNUM has 40109 mapped keys (of 61547 keys)
## org.Hs.egACCNUM2EG has 794946 mapped keys (of 794946 keys)
## org.Hs.egALIAS2EG has 125573 mapped keys (of 125573 keys)
## org.Hs.egCHR has 61403 mapped keys (of 61547 keys)
## org.Hs.egCHRLENGTHS has 595 mapped keys (of 595 keys)
## org.Hs.egCHRLOC has 28306 mapped keys (of 61547 keys)
## org.Hs.egCHRLOCEND has 28306 mapped keys (of 61547 keys)
## org.Hs.egENSEMBL has 35228 mapped keys (of 61547 keys)
## org.Hs.egENSEMBL2EG has 38282 mapped keys (of 38282 keys)
## org.Hs.egENSEMBLPROT has 7004 mapped keys (of 61547 keys)
## org.Hs.egENSEMBLPROT2EG has 21951 mapped keys (of 21951 keys)
## org.Hs.egENSEMBLTRANS has 13080 mapped keys (of 61547 keys)
## org.Hs.egENSEMBLTRANS2EG has 39166 mapped keys (of 39166 keys)
## org.Hs.egENZYME has 2229 mapped keys (of 61547 keys)
## org.Hs.egENZYME2EG has 975 mapped keys (of 975 keys)
## org.Hs.egGENENAME has 61547 mapped keys (of 61547 keys)
## org.Hs.egGO has 20692 mapped keys (of 61547 keys)
## org.Hs.egGO2ALLEGS has 22780 mapped keys (of 22780 keys)
## org.Hs.egGO2EG has 18348 mapped keys (of 18348 keys)
## org.Hs.egMAP has 59023 mapped keys (of 61547 keys)
## org.Hs.egMAP2EG has 2002 mapped keys (of 2002 keys)
## org.Hs.egOMIM has 16208 mapped keys (of 61547 keys)
## org.Hs.egOMIM2EG has 21951 mapped keys (of 21951 keys)
## org.Hs.egPATH has 5868 mapped keys (of 61547 keys)
## org.Hs.egPATH2EG has 229 mapped keys (of 229 keys)
## org.Hs.egPMID has 38673 mapped keys (of 61547 keys)
## org.Hs.egPMID2EG has 667264 mapped keys (of 667264 keys)
## org.Hs.egREFSEQ has 38808 mapped keys (of 61547 keys)
## org.Hs.egREFSEQ2EG has 281994 mapped keys (of 281994 keys)
## org.Hs.egSYMBOL has 61547 mapped keys (of 61547 keys)
## org.Hs.egSYMBOL2EG has 61538 mapped keys (of 61538 keys)
## org.Hs.egUCSCKG has 27348 mapped keys (of 61547 keys)
## org.Hs.egUNIGENE has 26001 mapped keys (of 61547 keys)
## org.Hs.egUNIGENE2EG has 29207 mapped keys (of 29207 keys)
## org.Hs.egUNIPROT has 19082 mapped keys (of 61547 keys)
## 
## 
## Additional Information about this package:
## 
## DB schema: HUMAN_DB
## DB schema version: 2.1
## Organism: Homo sapiens
## Date for NCBI data: 2020-Sep23
## Date for GO data: 2020-09-10
## Date for KEGG data: 2011-Mar15
## Date for Golden Path data: 2020-Aug27
## Date for Ensembl data: 2020-Aug18
select(org.Hs.eg.db, keys="NM_001001578", columns = "SYMBOL", keytype = "ACCNUM")
## 'select()' returned 1:1 mapping between keys and columns
##         ACCNUM SYMBOL
## 1 NM_001001578  PDE9A
names  = substr(rownames(counts), 1, nchar(rownames(counts))-2)
rownames(counts) = names
gen_symbols = select(org.Hs.eg.db, keys=names, columns = "SYMBOL", keytype = "ACCNUM")
## 'select()' returned 1:1 mapping between keys and columns
counts["symbol"] = gen_symbols[,2]

##Further filter the data to include only genes with counts higher than 10

counts_filtered10 = counts[rowSums(counts[1:6]) > 10,]

##Generate expression counts by gene

counts_bygene = aggregate(counts_filtered10[,1:6], by=list(counts_filtered10[,"symbol"]), sum)
rownames(counts_bygene) = counts_bygene[,1]
counts_bygene$Group.1 <- NULL
head(counts_bygene)
##          SRR1554534 SRR1554535 SRR1554539 SRR1554537 SRR1554538 SRR1554541
## A1BG            246        248        185        445        248        464
## A1BG-AS1         96         54         65        101         47        122
## A2ML1            43         26         17         19         22          9
## A2MP1             1         11         14         12         13         13
## A3GALT2           6          5          2         13         28          8
## A4GNT             1          4          6          0          9         19

##Generate boxplots of counts by gene

par(mar=c(7,5,1,1))
boxplot(counts_bygene, las=2, ylab = "counts")

##Log2 transform the counts by sample

l2_counts = log2(counts_bygene+1)

##Generate a Summarized Experiment object

library(SummarizedExperiment)
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## 
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
## 
##     anyMissing, rowMedians
## 
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## The following object is masked from 'package:Biobase':
## 
##     rowMedians
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Warning: package 'GenomeInfoDb' was built under R version 4.0.4
phenotype[,4] = as.factor(phenotype[,4])
Cdata = SummarizedExperiment(assays=counts_bygene, colData=phenotype)

##Generate objects for the DSeq2 package analysis using transformed and untransformed data Generate model with all factors for transformed and untransformed data, and a simple model including only age group for the transformed data

library(DESeq2)
dseq = DESeqDataSetFromMatrix(assay(Cdata), colData(Cdata), ~ age_group+sex+RIN)
##   the design formula contains one or more numeric variables that have mean or
##   standard deviation larger than 5 (an arbitrary threshold to trigger this message).
##   it is generally a good idea to center and scale numeric variables in the design
##   to improve GLM convergence.
dseq_not_transformed = DESeqDataSetFromMatrix(assay(Cdata), colData(Cdata), ~ age_group+sex+RIN)
##   the design formula contains one or more numeric variables that have mean or
##   standard deviation larger than 5 (an arbitrary threshold to trigger this message).
##   it is generally a good idea to center and scale numeric variables in the design
##   to improve GLM convergence.
dseq_age = DESeqDataSetFromMatrix(assay(Cdata), colData(Cdata), ~ age_group)

##Normalize the data and perform PCA of DSeq2 object

dseq_norm = varianceStabilizingTransformation(dseq)
count_pca = prcomp(assay(dseq_norm), center=TRUE, scale=TRUE)
pca = princomp(assay(dseq_norm), cor=TRUE)

##Generate plots of the PCA using ggplot First, transform the count_pca object to a dataframe to use in ggplot, then generate the plots

library(ggplot2)
dat = data.frame(X=count_pca$rotation[,1], Y=count_pca$rotation[,2], age_group=phenotype$age_group, RIN=phenotype$RIN, SEX=phenotype$sex)
boxplot(log2(counts_bygene+1), las=2, ylab = "log2(counts+1)", cex.axis=0.8, names = c("adult -SRR1554534", "adult - SRR1554535", "adult - SRR1554539", "fetal - SRR1554537", "fetal - SRR1554538", "fetal - SRR1554541"))

boxplot(assay(dseq_norm), las=2, ylab = "transformed by variance stabilizing", cex.axis=0.8, names = c("adult -SRR1554534", "adult - SRR1554535", "adult - SRR1554539", "fetal - SRR1554537", "fetal - SRR1554538", "fetal - SRR1554541"))

plot(pca)

##Run DESeq2 analysis for transformed and untransformed data Run DESeq2 analysis for transformed and untransformed data with all variables as response, and with only age group as response for the transformed data, to compare the number of differentially expressed genes with a p-adjusted value lower than 0.05

dseq_nottransformed_results = DESeq(dseq_not_transformed)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
dseq_results = DESeq(dseq)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
dseq_age_results = DESeq(dseq_age)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
sum(results(dseq_nottransformed_results)$padj < 0.05, na.rm=TRUE)
## [1] 85
sum(results(dseq_results)$padj < 0.05, na.rm=TRUE)
## [1] 85
sum(results(dseq_age_results)$padj < 0.05, na.rm=TRUE)
## [1] 6894

##Extract and log transform data for limma analysis

edata = assay(Cdata)
head(edata)
##          SRR1554534 SRR1554535 SRR1554539 SRR1554537 SRR1554538 SRR1554541
## A1BG            246        248        185        445        248        464
## A1BG-AS1         96         54         65        101         47        122
## A2ML1            43         26         17         19         22          9
## A2MP1             1         11         14         12         13         13
## A3GALT2           6          5          2         13         28          8
## A4GNT             1          4          6          0          9         19
edata_lg2 = log2(edata + 1)
head(edata_lg2)
##          SRR1554534 SRR1554535 SRR1554539 SRR1554537 SRR1554538 SRR1554541
## A1BG       7.948367   7.960002   7.539159   8.800900   7.960002   8.861087
## A1BG-AS1   6.599913   5.781360   6.044394   6.672425   5.584963   6.942515
## A2ML1      5.459432   4.754888   4.169925   4.321928   4.523562   3.321928
## A2MP1      1.000000   3.584963   3.906891   3.700440   3.807355   3.807355
## A3GALT2    2.807355   2.584963   1.584963   3.807355   4.857981   3.169925
## A4GNT      1.000000   2.321928   2.807355   0.000000   3.321928   4.321928
dim(edata_lg2)
## [1] 17550     6
edata_lg2_filt = edata_lg2[rowMeans(edata_lg2) > 10, ]
dim(edata_lg2_filt)
## [1] 3199    6

##Run limma analysis

library(limma)
## 
## Attaching package: 'limma'
## The following object is masked from 'package:DESeq2':
## 
##     plotMA
## The following object is masked from 'package:BiocGenerics':
## 
##     plotMA
library(edgeR)
model = model.matrix(~ Cdata$age_group)
fit_limma = lmFit(edata_lg2_filt, model)
ebayes = eBayes(fit_limma)
ebayes
## An object of class "MArrayLM"
## $coefficients
##          (Intercept) Cdata$age_groupfetal
## AARS1       14.23026           -0.5186347
## AARS2       10.39848            1.1021997
## AASDHPPT    12.49400            1.5562765
## AASS        10.01977            1.1593046
## AATF        10.28662            1.1379707
## 3194 more rows ...
## 
## $rank
## [1] 2
## 
## $assign
## [1] 0 1
## 
## $qr
## $qr
##   (Intercept) Cdata$age_groupfetal
## 1  -2.4494897           -1.2247449
## 2   0.4082483            1.2247449
## 3   0.4082483            0.2898979
## 4   0.4082483           -0.5265986
## 5   0.4082483           -0.5265986
## 6   0.4082483           -0.5265986
## attr(,"assign")
## [1] 0 1
## attr(,"contrasts")
## attr(,"contrasts")$`Cdata$age_group`
## [1] "contr.treatment"
## 
## 
## $qraux
## [1] 1.408248 1.289898
## 
## $pivot
## [1] 1 2
## 
## $tol
## [1] 1e-07
## 
## $rank
## [1] 2
## 
## 
## $df.residual
## [1] 4 4 4 4 4
## 3194 more elements ...
## 
## $sigma
##     AARS1     AARS2  AASDHPPT      AASS      AATF 
## 0.1460217 0.1973929 0.7993312 0.8863305 0.2548009 
## 3194 more elements ...
## 
## $cov.coefficients
##                      (Intercept) Cdata$age_groupfetal
## (Intercept)            0.3333333           -0.3333333
## Cdata$age_groupfetal  -0.3333333            0.6666667
## 
## $stdev.unscaled
##          (Intercept) Cdata$age_groupfetal
## AARS1      0.5773503            0.8164966
## AARS2      0.5773503            0.8164966
## AASDHPPT   0.5773503            0.8164966
## AASS       0.5773503            0.8164966
## AATF       0.5773503            0.8164966
## 3194 more rows ...
## 
## $pivot
## [1] 1 2
## 
## $Amean
##    AARS1    AARS2 AASDHPPT     AASS     AATF 
## 13.97094 10.94958 13.27213 10.59942 10.85561 
## 3194 more elements ...
## 
## $method
## [1] "ls"
## 
## $design
##   (Intercept) Cdata$age_groupfetal
## 1           1                    0
## 2           1                    0
## 3           1                    0
## 4           1                    1
## 5           1                    1
## 6           1                    1
## attr(,"assign")
## [1] 0 1
## attr(,"contrasts")
## attr(,"contrasts")$`Cdata$age_group`
## [1] "contr.treatment"
## 
## 
## $df.prior
## [1] 4.238586
## 
## $s2.prior
## [1] 0.1275318
## 
## $var.prior
## [1] 125.4589 121.1807
## 
## $proportion
## [1] 0.01
## 
## $s2.post
##      AARS1      AARS2   AASDHPPT       AASS       AATF 
## 0.07596496 0.08453032 0.37582612 0.44702837 0.09713421 
## 3194 more elements ...
## 
## $t
##          (Intercept) Cdata$age_groupfetal
## AARS1       89.42659            -2.304626
## AARS2       61.94758             4.643010
## AASDHPPT    35.29950             3.109130
## AASS        25.95677             2.123614
## AATF        57.16725             4.471885
## 3194 more rows ...
## 
## $df.total
## [1] 8.238586 8.238586 8.238586 8.238586 8.238586
## 3194 more elements ...
## 
## $p.value
##           (Intercept) Cdata$age_groupfetal
## AARS1    1.332123e-13          0.049201073
## AARS2    2.731016e-12          0.001535283
## AASDHPPT 2.763568e-10          0.013951233
## AASS     3.407894e-09          0.065462632
## AATF     5.285168e-12          0.001931586
## 3194 more rows ...
## 
## $lods
##          (Intercept) Cdata$age_groupfetal
## AARS1       18.33334            -4.917174
## AARS2       17.11436            -1.326569
## AASDHPPT    14.09566            -3.642994
## AASS        11.93187            -5.196401
## AATF        16.76922            -1.570223
## 3194 more rows ...
## 
## $F
## [1] 7710.9646 4265.8205 1410.9320  756.2181 3649.6299
## 3194 more elements ...
## 
## $F.p.value
## [1] 3.307805e-14 3.783242e-13 3.578336e-11 4.623020e-10 7.189030e-13
## 3194 more elements ...
limma_table = topTable(ebayes, number=length(rownames(edata_lg2_filt)))
## Removing intercept from test coefficients
head(limma_table)
##             logFC  AveExpr         t      P.Value    adj.P.Val        B
## SOX11    9.678600 12.27098  40.10630 9.706908e-11 3.105240e-07 13.83080
## DRAXIN   7.861109 11.45553  27.91827 1.881400e-09 3.009299e-06 11.93896
## SOX4     7.628643 12.74369  24.82583 4.898471e-09 4.587674e-06 11.20921
## GABRD   -5.668479 10.00311 -24.34880 5.736385e-09 4.587674e-06 11.08372
## MEX3B    5.224093 11.39089  22.64248 1.035470e-08 5.720225e-06 10.60233
## BHLHE22  5.893903 11.60899  22.46396 1.104209e-08 5.720225e-06 10.54883
limma_table_output = limma_table[,c(1,4,5)]
head(limma_table_output)
##             logFC      P.Value    adj.P.Val
## SOX11    9.678600 9.706908e-11 3.105240e-07
## DRAXIN   7.861109 1.881400e-09 3.009299e-06
## SOX4     7.628643 4.898471e-09 4.587674e-06
## GABRD   -5.668479 5.736385e-09 4.587674e-06
## MEX3B    5.224093 1.035470e-08 5.720225e-06
## BHLHE22  5.893903 1.104209e-08 5.720225e-06

##Save tab delimited file of all differentially expressed genes

write.table(limma_table_output, file="genes_dif_expr.txt", sep="\t", row.names = TRUE, col.names = TRUE)

##Generates Volcano plot

par(mar=c(5,5,1,1))
with(limma_table, plot(logFC, -log10(adj.P.Val), pch=20, main="Volcano plot"))
with(subset(limma_table, adj.P.Val < 0.05), points(logFC, -log10(adj.P.Val), pch=20, col="red"))

##Filter differentially expressed genes by adjusted values of p lower than 0.05 Filter the differentially expressed genes by adjusted p values and generates objects for upregulated and downregulated genes

dif_expr = limma_table_output[limma_table_output$adj.P.Val < 0.05,]
downreg = limma_table_output[limma_table_output$adj.P.Val < 0.05 & limma_table_output$logFC > 1,]
upreg = limma_table_output[limma_table_output$adj.P.Val < 0.05 & limma_table_output$logFC < -1,]
dim(dif_expr)
## [1] 2261    3
dim(downreg)
## [1] 1723    3
dim(upreg)
## [1] 205   3

##Generates boxplots of differentially expressed genes

edata_l2filt_difexp = edata_lg2_filt[rownames(dif_expr),]
edata_l2filt_downreg = edata_lg2_filt[rownames(downreg),]
edata_l2filt_upreg = edata_lg2_filt[rownames(upreg),]
par(mar=c(8,5,1,1))
boxplot(edata_l2filt_difexp, las=2, main="diferential expression by sample", ylab = "log2(counts+1)", cex.axis=0.8, names = c("adult -SRR1554534", "adult - SRR1554535", "adult - SRR1554539", "fetal - SRR1554537", "fetal - SRR1554538", "fetal - SRR1554541"))

boxplot(edata_l2filt_downreg, las=2, main="downregulated genes by sample", ylab = "log2(counts+1)", cex.axis=0.8, names = c("adult -SRR1554534", "adult - SRR1554535", "adult - SRR1554539", "fetal - SRR1554537", "fetal - SRR1554538", "fetal - SRR1554541"))

boxplot(edata_l2filt_upreg, las=2, main="upregulated genes by sample", ylab = "log2(counts+1)", cex.axis=0.8, names = c("adult -SRR1554534", "adult - SRR1554535", "adult - SRR1554539", "fetal - SRR1554537", "fetal - SRR1554538", "fetal - SRR1554541"))

##Generate dispersion plot of the total number of up and downregulated genes by sample

library(ggrepel)
up_down = data.frame(colSums(edata_l2filt_upreg), colSums(edata_l2filt_downreg))
colnames(up_down) = c("upregulated genes [log2(counts+1)]", "downregulated genes [log2(counts+1)]")
row.names(up_down) = c("adult - SRR1554534", "adult - SRR1554535", "adult - SRR1554539", "fetal - SRR1554537", "fetal - SRR1554538", "fetal - SRR1554541")
up_down_plot = ggplot(up_down, aes(x= `upregulated genes [log2(counts+1)]`, y=`downregulated genes [log2(counts+1)]`))+geom_point(size=5)
up_down_plot + geom_label_repel(aes(label=row.names(up_down)), size=3)

##Get methylation narrow peaks for H3K4me3 using AnnotationHub, to compare with the diffentially expressed genes

library(AnnotationHub)
## Loading required package: BiocFileCache
## Loading required package: dbplyr
## 
## Attaching package: 'AnnotationHub'
## The following object is masked from 'package:Biobase':
## 
##     cache
ah <- AnnotationHub()
## snapshotDate(): 2020-10-27
ah <- subset(ah, species == "Homo sapiens")
ah_fetal <- query(ah, c("EpigenomeRoadMap", "H3K4me3"))
fetal_met <- ah_fetal[["AH30471"]]
## loading from cache
## require("rtracklayer")
## 
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
## 
##     strsplit
ah_adult <- query(ah, c("EpigenomeRoadMap", "H3K4me3"))
adult_met <- ah_adult[["AH30413"]]
## loading from cache
ah_liver <- query(ah, c("EpigenomeRoadMap", "H3K4me3"))
liver_met <- ah_liver[["AH30367"]]
## loading from cache

##Get gene Symbols from limma results and convert to Entrez ID

library(mygene)
## Loading required package: GenomicFeatures
## Warning: package 'GenomicFeatures' was built under R version 4.0.4
## 
## Attaching package: 'mygene'
## The following object is masked from 'package:AnnotationHub':
## 
##     query
dif_exp_genes = row.names(limma_table[limma_table$adj.P.Val < 0.05,])
dif_exp_gene_ids = queryMany(dif_exp_genes, scopes = "symbol", fields = "entrezgene", species = "human" )
## Querying chunk 1
## Querying chunk 2
## Querying chunk 3
## Finished
## Pass returnall=TRUE to return lists of duplicate or missing query terms.
head(dif_exp_gene_ids)
## DataFrame with 6 rows and 5 columns
##         query         _id   X_score  entrezgene  notfound
##   <character> <character> <numeric> <character> <logical>
## 1       SOX11        6664   89.6143        6664        NA
## 2      DRAXIN      374946   86.8990      374946        NA
## 3        SOX4        6659   88.9639        6659        NA
## 4       GABRD        2563   86.0339        2563        NA
## 5       MEX3B       84206   86.5313       84206        NA
## 6     BHLHE22       27319   86.7396       27319        NA
dif_exp_gene_entrez = na.omit(dif_exp_gene_ids$entrezgene)

##Generate intervals of genes plus promoters for the differentially expressed genes

library(TxDb.Hsapiens.UCSC.hg19.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
txdb_genes <- genes(txdb)
##   403 genes were dropped because they have exons located on both strands
##   of the same reference sequence or on more than one reference sequence,
##   so cannot be represented by a single genomic range.
##   Use 'single.strand.genes.only=FALSE' to get all the genes in a
##   GRangesList object, or use suppressMessages() to suppress this message.
dif_exp_promoters <- promoters(txdb_genes[dif_exp_gene_entrez %in% txdb_genes$gene_id])
dif_exp_prom = promoters(txdb_genes[txdb_genes$gene_id %in% dif_exp_gene_ids$entrezgene])

####Get the overlaping intervals between all genes/promoters and methylation marks

fetal_ov_promoters = subsetByOverlaps(fetal_met, dif_exp_promoters)
adult_ov_promoters = subsetByOverlaps(adult_met, dif_exp_promoters)
liver_ov_promoters = subsetByOverlaps(liver_met, dif_exp_promoters)

##Get the overlaping intervals between differentially expressed genes/promoters and methylation marks

adult_ov_prom = subsetByOverlaps(adult_met, dif_exp_prom)
fetal_ov_prom = subsetByOverlaps(fetal_met, dif_exp_prom)
liver_ov_prom = subsetByOverlaps(liver_met, dif_exp_prom)

##Generate Venn Diagrams and hypergeometric test between the overlaping intervals

library(ChIPpeakAnno)
## 
## Attaching package: 'ChIPpeakAnno'
## The following object is masked from 'package:annotate':
## 
##     getGO
met_overlaps = findOverlapsOfPeaks(fetal_met, adult_met, liver_met, connectedPeaks = "merge")
## duplicated or NA names found. 
##                 Rename all the names by numbers.
## duplicated or NA names found. 
##                 Rename all the names by numbers.
## duplicated or NA names found. 
##                 Rename all the names by numbers.
peaks_prom_overlaps = findOverlapsOfPeaks(fetal_ov_prom, adult_ov_prom, liver_ov_prom, connectedPeaks = "merge")
## duplicated or NA names found. 
##                 Rename all the names by numbers.
## duplicated or NA names found. 
##                 Rename all the names by numbers.
## duplicated or NA names found. 
##                 Rename all the names by numbers.
vennDiag_all_met <- makeVennDiagram(met_overlaps)
## Missing totalTest! totalTest is required for HyperG test. 
## If totalTest is missing, pvalue will be calculated by estimating 
## the total binding sites of encoding region of human.
## totalTest = humanGenomeSize * (2%(codingDNA) + 
##              1%(regulationRegion)) / ( 2 * averagePeakWidth )
##           = 3.3e+9 * 0.03 / ( 2 * averagePeakWidth)
##           = 5e+7 /averagePeakWidth

HGtest <- makeVennDiagram(peaks_prom_overlaps, totalTest = 3500)

HGtest$p.value
##      fetal_ov_prom adult_ov_prom liver_ov_prom          pval
## [1,]             0             1             1 6.049833e-117
## [2,]             1             0             1 4.562069e-166
## [3,]             1             1             0 2.923927e-127
HGtest$vennCounts
##      fetal_ov_prom adult_ov_prom liver_ov_prom Counts
## [1,]             0             0             0    338
## [2,]             0             0             1    163
## [3,]             0             1             0    459
## [4,]             0             1             1    733
## [5,]             1             0             0     37
## [6,]             1             0             1      2
## [7,]             1             1             0     98
## [8,]             1             1             1   1670
## attr(,"class")
## [1] "VennCounts"

##Turn Venn counts into a matrix

m = matrix(nrow=7, ncol=3)
m[1,]=c(0, 0, 238)
m[2,]=c(0, 442, 0)
m[3,]=c(0, 718, 718)
m[4,]=c(37, 0, 0)
m[5,]=c(2, 0, 2)
m[6,]=c(96, 96, 0)
m[7,]=c(1619, 1619, 1619)

##Perform a Pearson’s Chi-squared test

chisq.test(m, rescale.p = TRUE, simulate.p.value = TRUE)
## 
##  Pearson's Chi-squared test with simulated p-value (based on 2000
##  replicates)
## 
## data:  m
## X-squared = 2026.3, df = NA, p-value = 0.0004998
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] pt_BR.UTF-8/pt_BR.UTF-8/pt_BR.UTF-8/C/pt_BR.UTF-8/pt_BR.UTF-8
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] ChIPpeakAnno_3.24.1                    
##  [2] TxDb.Hsapiens.UCSC.hg19.knownGene_3.2.2
##  [3] mygene_1.26.0                          
##  [4] GenomicFeatures_1.42.2                 
##  [5] BSgenome.Hsapiens.UCSC.hg19_1.4.3      
##  [6] BSgenome_1.58.0                        
##  [7] Biostrings_2.58.0                      
##  [8] XVector_0.30.0                         
##  [9] rtracklayer_1.50.0                     
## [10] AnnotationHub_2.22.0                   
## [11] BiocFileCache_1.14.0                   
## [12] dbplyr_2.1.0                           
## [13] ggrepel_0.9.1                          
## [14] edgeR_3.32.1                           
## [15] limma_3.46.0                           
## [16] ggplot2_3.3.3                          
## [17] DESeq2_1.30.1                          
## [18] SummarizedExperiment_1.20.0            
## [19] GenomicRanges_1.42.0                   
## [20] GenomeInfoDb_1.26.4                    
## [21] MatrixGenerics_1.2.1                   
## [22] matrixStats_0.58.0                     
## [23] org.Hs.eg.db_3.12.0                    
## [24] annotate_1.68.0                        
## [25] XML_3.99-0.6                           
## [26] AnnotationDbi_1.52.0                   
## [27] IRanges_2.24.1                         
## [28] S4Vectors_0.28.1                       
## [29] Biobase_2.50.0                         
## [30] BiocGenerics_0.36.0                    
## 
## loaded via a namespace (and not attached):
##   [1] backports_1.2.1               Hmisc_4.5-0                  
##   [3] plyr_1.8.6                    lazyeval_0.2.2               
##   [5] splines_4.0.3                 BiocParallel_1.24.1          
##   [7] digest_0.6.27                 ensembldb_2.14.0             
##   [9] htmltools_0.5.1.1             fansi_0.4.2                  
##  [11] magrittr_2.0.1                checkmate_2.0.0              
##  [13] memoise_2.0.0                 cluster_2.1.1                
##  [15] askpass_1.1                   prettyunits_1.1.1            
##  [17] jpeg_0.1-8.1                  colorspace_2.0-0             
##  [19] blob_1.2.1                    rappdirs_0.3.3               
##  [21] xfun_0.22                     dplyr_1.0.5                  
##  [23] crayon_1.4.1                  RCurl_1.98-1.3               
##  [25] jsonlite_1.7.2                graph_1.68.0                 
##  [27] genefilter_1.72.1             survival_3.2-10              
##  [29] glue_1.4.2                    gtable_0.3.0                 
##  [31] zlibbioc_1.36.0               DelayedArray_0.16.2          
##  [33] scales_1.1.1                  futile.options_1.0.1         
##  [35] DBI_1.1.1                     Rcpp_1.0.6                   
##  [37] xtable_1.8-4                  progress_1.2.2               
##  [39] htmlTable_2.1.0               foreign_0.8-81               
##  [41] bit_4.0.4                     Formula_1.2-4                
##  [43] sqldf_0.4-11                  htmlwidgets_1.5.3            
##  [45] httr_1.4.2                    RColorBrewer_1.1-2           
##  [47] ellipsis_0.3.1                pkgconfig_2.0.3              
##  [49] farver_2.1.0                  nnet_7.3-15                  
##  [51] sass_0.3.1                    locfit_1.5-9.4               
##  [53] utf8_1.2.1                    tidyselect_1.1.0             
##  [55] labeling_0.4.2                rlang_0.4.10                 
##  [57] later_1.1.0.1                 munsell_0.5.0                
##  [59] BiocVersion_3.12.0            tools_4.0.3                  
##  [61] cachem_1.0.4                  gsubfn_0.7                   
##  [63] generics_0.1.0                RSQLite_2.2.4                
##  [65] evaluate_0.14                 stringr_1.4.0                
##  [67] fastmap_1.1.0                 yaml_2.2.1                   
##  [69] knitr_1.31                    bit64_4.0.5                  
##  [71] purrr_0.3.4                   AnnotationFilter_1.14.0      
##  [73] KEGGREST_1.30.1               RBGL_1.66.0                  
##  [75] mime_0.10                     formatR_1.8                  
##  [77] xml2_1.3.2                    biomaRt_2.46.3               
##  [79] compiler_4.0.3                rstudioapi_0.13              
##  [81] curl_4.3                      png_0.1-7                    
##  [83] interactiveDisplayBase_1.28.0 tibble_3.1.0                 
##  [85] geneplotter_1.68.0            bslib_0.2.4                  
##  [87] stringi_1.5.3                 futile.logger_1.4.3          
##  [89] highr_0.8                     lattice_0.20-41              
##  [91] ProtGenerics_1.22.0           Matrix_1.3-2                 
##  [93] multtest_2.46.0               vctrs_0.3.6                  
##  [95] pillar_1.5.1                  lifecycle_1.0.0              
##  [97] BiocManager_1.30.10           jquerylib_0.1.3              
##  [99] data.table_1.14.0             bitops_1.0-6                 
## [101] httpuv_1.5.5                  R6_2.5.0                     
## [103] latticeExtra_0.6-29           promises_1.2.0.1             
## [105] gridExtra_2.3                 lambda.r_1.2.4               
## [107] MASS_7.3-53.1                 assertthat_0.2.1             
## [109] chron_2.3-56                  proto_1.0.0                  
## [111] openssl_1.4.3                 withr_2.4.1                  
## [113] regioneR_1.22.0               GenomicAlignments_1.26.0     
## [115] Rsamtools_2.6.0               GenomeInfoDbData_1.2.4       
## [117] hms_1.0.0                     VennDiagram_1.6.20           
## [119] grid_4.0.3                    rpart_4.1-15                 
## [121] rmarkdown_2.7                 shiny_1.6.0                  
## [123] base64enc_0.1-3